# HKPPW
# Clark and Linzer replication with additional statistics from the MC results
# ----------------------------#
set.seed(1138183)

library(lme4)
library(MASS)
library(matrixStats)

# Set working directory
setwd("")

dgp <- function(truecor, nperunit, b1, x.sd, y.sd) {
  nunits <- length(nperunit)
  v <- mvrnorm(nunits, c(0, 0), matrix(c(1, truecor, truecor, 1), 2, 2))
  b0 <- v[, 1]
  xmean <- v[, 2]
  dat <- NULL
  for (j in 1:nunits) {
    x <- rnorm(nperunit[j], xmean[j], x.sd)
    y <- b0[j]  +  b1*x  +  rnorm(nperunit[j], 0, y.sd)
    dat <- rbind(dat, cbind(x, y, unit=j))
  }
  dat <- data.frame(dat)
  return(list(dat=dat, b0=b0, xmean=xmean))  
}

## Parameters to manipulate in simulation:
## [1] correlation between x and unit effect
## [2] number of units
## [3] sample size per unit
## [4] strength of association between x and y
## [5] how "spread apart" or dissimilar the units are, controlled by 
##     increasing or decreasing x.sd; smaller values make units more
##     dissimilar and farther apart from one another. This can be
##     "corrected" by making y.sd smaller, so there's less noise in 
##     the within-unit relationship given the amount of variation in x.
##     Making the variation in x small is akin to having "slow-moving" 
##     IVs in TSCS data.

truecor <- c(seq(0, 0.9, 0.1), 0.95)
units <- c(10, 40, 100)
Nperunit <- c(5, 20, 50)

b1 <- 1 # also try b1 = 0, 0.5, 2
#x.sd <- 0.2 # sluggish
x.sd <- 1 # standard
y.sd <- 1

coeflist.fe <- list()
coeflist.re <- list()
coeflist.pool <- list()
selist.fe <- list()
selist.re <- list()
selist.pool <- list()
ullist.fe <- list()
lllist.fe <- list()
ullist.re <- list()
lllist.re <- list()
ullist.pool <- list()
lllist.pool <- list()

corlist <- list()

nsim <- 2000 #   (C&L use s = 2000)
mc.time <- proc.time() # start MC timer

index <- 1
for (unitloop in units) {
  for (Nperloop in Nperunit) {
    cat("\nunits:", unitloop, "  Nperunit:", Nperloop, "  ", 
        as.character(Sys.time()), "\n")
    coefmat.fe <- NULL
    coefmat.re <- NULL
    coefmat.pool <- NULL
    semat.fe <- NULL
    semat.re <- NULL
    semat.pool <- NULL
    ulmat.fe <- NULL
    llmat.fe <- NULL
    ulmat.re <- NULL
    llmat.re <- NULL
    ulmat.pool <- NULL
    llmat.pool <- NULL
    
    for (i in 1:length(truecor)) {
      cat("\t correlation:", truecor[i], "\n")
      flush.console()
      coef.fe <- NULL
      coef.re <- NULL
      coef.pool <- NULL
      se.fe <- NULL
      se.re <- NULL
      se.pool <- NULL
      ul.fe <- NULL
      ll.fe <- NULL
      ul.re <- NULL
      ll.re <- NULL
      ul.pool <- NULL
      ll.pool <- NULL
      for (k in 1:nsim) {
        # create simulated data set
        draw <- dgp(truecor[i], rep(Nperloop, unitloop), b1, x.sd, y.sd)
        ## or make unbalanced panels
        #sampsize <- round(rnorm(unitloop, Nperloop, Nperloop/2))
        #sampsize[sampsize < 2] <- 2
        #draw <- dgp(truecor[i], sampsize, b1, x.sd, y.sd)
        ## end unbalanced panels
        dat <- draw$dat
        # estimate FE, RE, and pooled models
        reg.fe <- lm(y~x + as.factor(unit)-1, dat)
        reg.re <- lmer(y~x + (1|unit), dat)
        reg.pool <- lm(y~x, dat)
        #reg.rc <- lmer(y~x + (1 + x|unit), dat)
        # save estimates of coefficients on x
        coef.fe <- c(coef.fe, coefficients(reg.fe)[1])
        coef.re <- c(coef.re, fixef(reg.re)[2])
        coef.pool <- c(coef.pool, coefficients(reg.pool)[2])
        
        # Save estimates of standard error
        se.fe <- c(se.fe, summary(reg.fe)$coefficients[1,2])
        se.re <- c(se.re, summary(reg.re)$coefficients[2,2])
        se.pool <- c(se.pool, summary(reg.pool)$coefficients[2,2])
        
        # save UL and LL estimates as well:
        ll.fe <- c(ll.fe, confint(reg.fe, parm = c(1), level = 0.95)[1]) # FE LL
        ul.fe <- c(ul.fe, confint(reg.fe, parm = c(1), level = 0.95)[2]) # FE UL
        ll.re <- c(ll.re, confint(reg.re, parm = c(4), level =0.95)[1]) # RE LL
        ul.re <- c(ul.re, confint(reg.re, parm = c(4), level =0.95)[2]) # RE UL
        ll.pool <- c(ll.pool, confint(reg.pool, parm = c(2), level = 0.95)[1]) # pool LL
        ul.pool <- c(ul.pool, confint(reg.pool, parm = c(2), level = 0.95)[2]) # pool UL
        
      }
      coefmat.fe <- cbind(coefmat.fe, coef.fe)
      coefmat.re <- cbind(coefmat.re, coef.re)
      coefmat.pool <- cbind(coefmat.pool, coef.pool)
      semat.fe <- cbind(semat.fe, se.fe)
      semat.re <- cbind(semat.re, se.re)
      semat.pool <- cbind(semat.pool, se.pool)
      llmat.fe <- cbind(llmat.fe, ll.fe)
      ulmat.fe <- cbind(ulmat.fe, ul.fe)
      llmat.re <- cbind(llmat.re, ll.re)
      ulmat.re <- cbind(ulmat.re, ul.re)
      llmat.pool <- cbind(llmat.pool, ll.pool)
      ulmat.pool <- cbind(ulmat.pool, ul.pool)
    }
    coeflist.fe[[index]] <- coefmat.fe
    coeflist.re[[index]] <- coefmat.re
    coeflist.pool[[index]] <- coefmat.pool
    selist.fe[[index]] <- semat.fe
    selist.re[[index]] <- semat.re
    selist.pool[[index]] <- semat.pool
    lllist.fe[[index]] <- llmat.fe
    ullist.fe[[index]] <- ulmat.fe
    lllist.re[[index]] <- llmat.re
    ullist.re[[index]] <- ulmat.re
    lllist.pool[[index]] <- llmat.pool
    ullist.pool[[index]] <- ulmat.pool
    index <- index + 1
  }
}
(proc.time() - mc.time)[3]/60 # elapsed time (in minutes)
# about 37 minutes for 100 sims
# about 8.5 hours for 2000 sims
# 

# Container of results:
sim.result <- list(coeflist.fe = coeflist.fe, 
                   coeflist.re = coeflist.re, 
                   coeflist.pool = coeflist.pool,
                   selist.fe = selist.fe, 
                   selist.re = selist.re, 
                   selist.pool = selist.pool,
                   lllist.fe = lllist.fe,
                   ullist.fe = ullist.fe,
                   lllist.re = lllist.re,
                   ullist.re = ullist.re,
                   lllist.pool = lllist.pool,
                   ullist.pool = ullist.pool)

save.image("2000sims_standard.RData")

## Plot RMSE of slope estimate
## FE=solid line, RE=dashed line, pooled=dotted line
#quartz(width=9, height=8)
pdf("CL_RMSE_Standard.pdf", width=9, height=8)
par(mfrow=c(length(units), length(Nperunit)), mar=c(4, 4, 2, 0.5), las=1)
index <- 1
for (unitloop in units) {
  for (Nperloop in Nperunit) {
    rmse.fe <- sqrt(colMeans((coeflist.fe[[index]] - b1)^2))
    rmse.re <- sqrt(colMeans((coeflist.re[[index]] - b1)^2))
    rmse.pool <- sqrt(colMeans((coeflist.pool[[index]] - b1)^2))
    plot(truecor, rmse.fe, pch=19, type="l", col = "#80b1d3", lwd = 3, ylim=c(0, 1), xlim=c(0, 1), 
         xlab=expression(rho), ylab="RMSE", 
         main=paste(unitloop, "units, ", Nperloop, "observations per unit"))
    lines(truecor, rmse.re, col = "#fb8072", lwd = 3, lty=2, pch=19  )
    lines(truecor, rmse.pool, col = "#bebada", lwd = 3, pch=19, lty=3)
    index <- index + 1
  }
}
dev.off()
#quartz.save("CL_RMSE_Standard", type = "pdf")

# Plot mean bias:
## FE=solid line, RE=dashed line, pooled=dotted line
#quartz(width=9, height=8)
pdf("CL_Bias_Standard.pdf", width=9, height=8)
par(mfrow=c(length(units), length(Nperunit)), mar=c(4, 4, 2, 0.5), las=1)
index <- 1
for (unitloop in units) {
  for (Nperloop in Nperunit) {
    mab.fe <- colMeans(coeflist.fe[[index]] - b1)
    mab.re <- colMeans(coeflist.re[[index]] - b1)
    mab.pool <- colMeans(coeflist.pool[[index]] - b1)
    plot(truecor, mab.fe, pch=19, type="l", col = "#80b1d3", lwd = 3, ylim=c(0, 1), xlim=c(0, 1), 
         xlab=expression(rho), ylab="Bias", 
         main=paste(unitloop, "units, ", Nperloop, "observations per unit"))
    lines(truecor, mab.re, pch=19, col = "#fb8072", lwd = 3, lty=2)
    lines(truecor, mab.pool, pch=19, col = "#bebada", lwd = 3, lty=3)
    index <- index + 1
  }
}
dev.off()
#quartz.save("CL_Bias_Standard", type = "pdf")

# Plot standard deviation:
## FE=solid line, RE=dashed line, pooled=dotted line
#quartz(width=9, height=8)
pdf("CL_SD_Standard.pdf", width=9, height=8)
par(mfrow=c(length(units), length(Nperunit)), mar=c(4, 4, 2, 0.5), las=1)
index <- 1
for (unitloop in units) {
  for (Nperloop in Nperunit) {
    sd.fe <- colSds(coeflist.fe[[index]])
    sd.re <- colSds(coeflist.re[[index]])
    sd.pool <- colSds(coeflist.pool[[index]])
    plot(truecor, sd.fe, pch=19, type="l", col = "#80b1d3", lwd = 3, ylim=c(0, 1), xlim=c(0, 1), 
         xlab=expression(rho), ylab="SD", 
         main=paste(unitloop, "units, ", Nperloop, "observations per unit"))
    lines(truecor, sd.re, pch=19, col = "#fb8072", lwd = 3, lty=2)
    lines(truecor, sd.pool, pch=19, col = "#bebada", lwd = 3, lty=3)
    index <- index + 1
  }
}
dev.off()
#quartz.save("CL_SD_Standard", type = "pdf")

# Plot power (1-TypeII error):
## FE=solid line, RE=dashed line, pooled=dotted line
#quartz(width=9, height=8)
pdf("CL_Power_Standard.pdf", width=9, height=8)
par(mfrow=c(length(units), length(Nperunit)), mar=c(4, 4, 2, 0.5), las=1)
index <- 1
for (unitloop in units) {
  for (Nperloop in Nperunit) {
    power.fe <- colMeans(ifelse(lllist.fe[[index]] > 0 | ullist.fe[[index]] < 0, 1, 0))
    power.re <- colMeans(ifelse(lllist.re[[index]] > 0 | ullist.re[[index]] < 0, 1, 0))
    power.pool <- colMeans(ifelse(lllist.pool[[index]] > 0 | ullist.pool[[index]] < 0, 1, 0))
    plot(truecor, power.fe, pch=19, type="l", col = "#80b1d3", lwd = 3, ylim=c(0, 1), xlim=c(0, 1), 
         xlab=expression(rho), ylab="Power", 
         main=paste(unitloop, "units, ", Nperloop, "observations per unit"))
    lines(truecor, power.re, pch=19,  col = "#fb8072", lwd = 3, lty=2)
    lines(truecor, power.pool, pch=19, col = "#bebada", lwd = 3, lty=3)
    index <- index + 1
  }
}
dev.off()
#quartz.save("CL_Power_Standard", type = "pdf")

# Plot coverage (how often do our 95% CI's reject beta= truth?):
## FE=solid line, RE=dashed line, pooled=dotted line
#quartz(width=9, height=8)
pdf("CL_Coverage_Standard.pdf", width=9, height=8)
par(mfrow=c(length(units), length(Nperunit)), mar=c(4, 4, 2, 0.5), las=1)
index <- 1
for (unitloop in units) {
  for (Nperloop in Nperunit) {
    coverage.fe <- colMeans(ifelse(lllist.fe[[index]] > b1 | ullist.fe[[index]] < b1, 0, 1))
    coverage.re <- colMeans(ifelse(lllist.re[[index]] > b1 | ullist.re[[index]] < b1, 0, 1), na.rm = T)
    coverage.pool <- colMeans(ifelse(lllist.pool[[index]] > b1 | ullist.pool[[index]] < b1, 0, 1))
    plot(truecor, coverage.fe, pch=19, type="l", col = "#80b1d3", lwd = 3, ylim=c(0, 1), xlim=c(0, 1), 
         xlab=expression(rho), ylab="Coverage", 
         main=paste(unitloop, "units, ", Nperloop, "observations per unit"))
    lines(truecor, coverage.re, pch=19, col = "#fb8072", lwd = 3, lty=2)
    lines(truecor, coverage.pool, pch=19, col = "#bebada", lwd = 3, lty=3)
    index <- index + 1
  }
}
dev.off()
#quartz.save("CL_Coverage_Standard", type = "pdf")

# Plot Optimism

## FE=solid line, RE=dashed line, pooled=dotted line
#quartz(width=9, height=8)
pdf("CL_Optimism_Standard.pdf", width=9, height=8)
par(mfrow=c(length(units), length(Nperunit)), mar=c(4, 4, 2, 0.5), las=1)
index <- 1
for (unitloop in units) {
  for (Nperloop in Nperunit) {
    sd.fe <- colSds(coeflist.fe[[index]])
    sd.re <- colSds(coeflist.re[[index]])
    sd.pool <- colSds(coeflist.pool[[index]])
    se.fe <- colMeans(selist.fe[[index]])
    se.re <- colMeans(selist.re[[index]])
    se.pool <- colMeans(selist.pool[[index]])
    # opt.fe <- se.fe/sd.fe
    # opt.re <- se.re/sd.re
    # opt.pool <- se.pool/sd.pool
    opt.fe <- sd.fe/se.fe
    opt.re <- sd.re/se.re
    opt.pool <- sd.pool/se.pool
    
    plot(truecor, opt.fe, pch=19, type="l", col = "#80b1d3", lwd = 3, ylim=c(0, 4), xlim=c(0, 1), 
         xlab=expression(rho), ylab="Overconfidence", 
         main=paste(unitloop, "units, ", Nperloop, "observations per unit"))
    lines(truecor, opt.re, pch=19, col = "#fb8072", lwd = 3, lty=2)
    lines(truecor, opt.pool, pch=19, col = "#bebada", lwd = 3, lty=3)
    index <- index + 1
  }
}
dev.off()
#quartz.save("CL_Optimism_Standard", type = "pdf")

# normality 
normal.pool <- matrix(data = NA, nrow = 99, ncol = 6)
colnames(normal.pool) <- c("units", "within_units", "corr", "p_value", "normal_90", "normal_95")
normal.fe <- matrix(data = NA, nrow = 99, ncol = 6)
colnames(normal.fe) <- c("units", "within_units", "corr", "p_value", "normal_90", "normal_95")
normal.re <- matrix(data = NA, nrow = 99, ncol = 6)
colnames(normal.re) <- c("units", "within_units", "corr", "p_value", "normal_90", "normal_95")
index <- 1
j <- 1
for (unitloop in units) {
  for (Nperloop in Nperunit) {
    for (i in 1:length(truecor)) {
      # pooled
      normal.pool[index, 1] <- unitloop
      normal.pool[index, 2] <- Nperloop
      normal.pool[index, 3] <- truecor[i]
      normal.pool[index, 4] <- shapiro.test(coeflist.pool[[j]][ , i])$p.value
      normal.pool[index, 5] <- ifelse(shapiro.test(coeflist.pool[[j]][ , i])$p.value > 0.1, "normal", "not normal")
      normal.pool[index, 6] <- ifelse(shapiro.test(coeflist.pool[[j]][ , i])$p.value > 0.05, "normal", "not normal")
      
      # fe
      normal.fe[index, 1] <- unitloop
      normal.fe[index, 2] <- Nperloop
      normal.fe[index, 3] <- truecor[i]
      normal.fe[index, 4] <- shapiro.test(coeflist.fe[[j]][ , i])$p.value
      normal.fe[index, 5] <- ifelse(shapiro.test(coeflist.fe[[j]][ , i])$p.value > 0.1, "normal", "not normal")
      normal.fe[index, 6] <- ifelse(shapiro.test(coeflist.fe[[j]][ , i])$p.value > 0.05, "normal", "not normal")
      
      # re
      normal.re[index, 1] <- unitloop
      normal.re[index, 2] <- Nperloop
      normal.re[index, 3] <- truecor[i]
      normal.re[index, 4] <- shapiro.test(coeflist.re[[j]][ , i])$p.value
      normal.re[index, 5] <- ifelse(shapiro.test(coeflist.re[[j]][ , i])$p.value > 0.1, "normal", "not normal")
      normal.re[index, 6] <- ifelse(shapiro.test(coeflist.re[[j]][ , i])$p.value > 0.05, "normal", "not normal")
      
      index <- index + 1
      print(j)
    }
    j <- j + 1
  }
}

# end of file.